Imports

Imports¶

In [1]:
#imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

import shap

%matplotlib inline
Data Collection

Dataset reference:

  • Anselin, D. L., et al. (2003). Geoda. geodacenter.github.io.
In [200]:
data = pd.read_csv('src/kc_house_data.csv')
In [3]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB
In [203]:
data.head()
Out[203]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

Data Processing
In [8]:
# drop date because it is irrelevant for our findings
data = data.drop(['date', 'id'],axis=1)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/var/folders/75/5mh9hp1s1xn2ypzg82bcvlnr0000gn/T/ipykernel_16086/3295968318.py in <module>
      1 # drop date because it is irrelevant for our findings
----> 2 data = data.drop(['date', 'id'],axis=1)

~/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4904                 weight  1.0     0.8
   4905         """
-> 4906         return super().drop(
   4907             labels=labels,
   4908             axis=axis,

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   4148         for axis, labels in axes.items():
   4149             if labels is not None:
-> 4150                 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   4151 
   4152         if inplace:

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
   4183                 new_axis = axis.drop(labels, level=level, errors=errors)
   4184             else:
-> 4185                 new_axis = axis.drop(labels, errors=errors)
   4186             result = self.reindex(**{axis_name: new_axis})
   4187 

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
   6015         if mask.any():
   6016             if errors != "ignore":
-> 6017                 raise KeyError(f"{labels[mask]} not found in axis")
   6018             indexer = indexer[~mask]
   6019         return self.delete(indexer)

KeyError: "['date' 'id'] not found in axis"
In [9]:
# Getting rid of price outliers
data = data.drop(data[data.price > 3887500.0].index) # rid of 12 incredibly high outliers
In [10]:
# Getting rid of bathroom outliers
data = data.drop(data[data.bedrooms > 15].index) 
In [11]:
# Since most of the yr_renovated is has 0 values, 
# the data will be transformed to boolean (yr_renovated to renovated)
data['yr_renovated'] = pd.Series(np.where(data.yr_renovated.values > 0, 1, 0),
          data.index)
data.rename(columns = {'yr_renovated':'renovated'}, inplace = True)
In [8]:
# Check out for outliers by printing histograms of all features
for feature in data.columns:
    print(feature)
    sns.histplot(data[feature])
    plt.show()
    print(data[feature].value_counts(bins=10, sort=False))
    print('Min', min(data[feature]))
    print('Max', max(data[feature]))
    print('\n\n')
price
(71224.999, 452500.0]     10912
(452500.0, 830000.0]       8054
(830000.0, 1207500.0]      1675
(1207500.0, 1585000.0]      520
(1585000.0, 1962500.0]      234
(1962500.0, 2340000.0]       86
(2340000.0, 2717500.0]       58
(2717500.0, 3095000.0]       31
(3095000.0, 3472500.0]       20
(3472500.0, 3850000.0]       10
Name: price, dtype: int64
Min 75000.0
Max 3850000.0



bedrooms
(-0.012, 1.1]     212
(1.1, 2.2]       2760
(2.2, 3.3]       9824
(3.3, 4.4]       6880
(4.4, 5.5]       1594
(5.5, 6.6]        269
(6.6, 7.7]         38
(7.7, 8.8]         13
(8.8, 9.9]          6
(9.9, 11.0]         4
Name: bedrooms, dtype: int64
Min 0
Max 11



bathrooms
(-0.009000000000000001, 0.8]      86
(0.8, 1.6]                      5307
(1.6, 2.4]                      7024
(2.4, 3.2]                      7317
(3.2, 4.0]                      1611
(4.0, 4.8]                       201
(4.8, 5.6]                        40
(5.6, 6.4]                         9
(6.4, 7.2]                         3
(7.2, 8.0]                         2
Name: bathrooms, dtype: int64
Min 0.0
Max 8.0



sqft_living
(276.749, 1615.0]      7570
(1615.0, 2940.0]      10713
(2940.0, 4265.0]       2780
(4265.0, 5590.0]        441
(5590.0, 6915.0]         75
(6915.0, 8240.0]         19
(8240.0, 9565.0]          1
(9565.0, 10890.0]         0
(10890.0, 12215.0]        0
(12215.0, 13540.0]        1
Name: sqft_living, dtype: int64
Min 290
Max 13540



sqft_lot
(-1130.84, 165603.9]      21290
(165603.9, 330687.8]        251
(330687.8, 495771.7]         37
(495771.7, 660855.6]         10
(660855.6, 825939.5]          1
(825939.5, 991023.4]          7
(991023.4, 1156107.3]         2
(1156107.3, 1321191.2]        1
(1321191.2, 1486275.1]        0
(1486275.1, 1651359.0]        1
Name: sqft_lot, dtype: int64
Min 520
Max 1651359



floors
(0.997, 1.25]    10678
(1.25, 1.5]       1910
(1.5, 1.75]          0
(1.75, 2.0]       8231
(2.0, 2.25]          0
(2.25, 2.5]        160
(2.5, 2.75]          0
(2.75, 3.0]        613
(3.0, 3.25]          0
(3.25, 3.5]          8
Name: floors, dtype: int64
Min 1.0
Max 3.5



waterfront
(-0.002, 0.1]    21442
(0.1, 0.2]           0
(0.2, 0.3]           0
(0.3, 0.4]           0
(0.4, 0.5]           0
(0.5, 0.6]           0
(0.6, 0.7]           0
(0.7, 0.8]           0
(0.8, 0.9]           0
(0.9, 1.0]         158
Name: waterfront, dtype: int64
Min 0
Max 1



view
(-0.005, 0.4]    19484
(0.4, 0.8]           0
(0.8, 1.2]         332
(1.2, 1.6]           0
(1.6, 2.0]         962
(2.0, 2.4]           0
(2.4, 2.8]           0
(2.8, 3.2]         509
(3.2, 3.6]           0
(3.6, 4.0]         313
Name: view, dtype: int64
Min 0
Max 4



condition
(0.995, 1.4]       30
(1.4, 1.8]          0
(1.8, 2.2]        172
(2.2, 2.6]          0
(2.6, 3.0]      14021
(3.0, 3.4]          0
(3.4, 3.8]          0
(3.8, 4.2]       5677
(4.2, 4.6]          0
(4.6, 5.0]       1700
Name: condition, dtype: int64
Min 1
Max 5



grade
(0.987, 2.2]        1
(2.2, 3.4]          3
(3.4, 4.6]         29
(4.6, 5.8]        242
(5.8, 7.0]      11018
(7.0, 8.2]       6068
(8.2, 9.4]       2615
(9.4, 10.6]      1134
(10.6, 11.8]      398
(11.8, 13.0]       92
Name: grade, dtype: int64
Min 1
Max 13



sqft_above
(280.879, 1202.0]    5620
(1202.0, 2114.0]     9988
(2114.0, 3026.0]     4065
(3026.0, 3938.0]     1483
(3938.0, 4850.0]      351
(4850.0, 5762.0]       66
(5762.0, 6674.0]       21
(6674.0, 7586.0]        2
(7586.0, 8498.0]        3
(8498.0, 9410.0]        1
Name: sqft_above, dtype: int64
Min 290
Max 9410



sqft_basement
(-4.131, 413.0]     15006
(413.0, 826.0]       3427
(826.0, 1239.0]      2256
(1239.0, 1652.0]      697
(1652.0, 2065.0]      159
(2065.0, 2478.0]       38
(2478.0, 2891.0]       14
(2891.0, 3304.0]        1
(3304.0, 3717.0]        1
(3717.0, 4130.0]        1
Name: sqft_basement, dtype: int64
Min 0
Max 4130



yr_built
(1899.884, 1911.5]     851
(1911.5, 1923.0]       952
(1923.0, 1934.5]      1079
(1934.5, 1946.0]      1360
(1946.0, 1957.5]      2586
(1957.5, 1969.0]      3218
(1969.0, 1980.5]      2525
(1980.5, 1992.0]      2782
(1992.0, 2003.5]      2658
(2003.5, 2015.0]      3589
Name: yr_built, dtype: int64
Min 1900
Max 2015



renovated
(-0.002, 0.1]    20689
(0.1, 0.2]           0
(0.2, 0.3]           0
(0.3, 0.4]           0
(0.4, 0.5]           0
(0.5, 0.6]           0
(0.6, 0.7]           0
(0.7, 0.8]           0
(0.8, 0.9]           0
(0.9, 1.0]         911
Name: renovated, dtype: int64
Min 0
Max 1



zipcode
(98000.80099999999, 98020.8]    2853
(98020.8, 98040.6]              4378
(98040.6, 98060.4]              3345
(98060.4, 98080.2]              1699
(98080.2, 98100.0]               351
(98100.0, 98119.8]              4257
(98119.8, 98139.6]              1811
(98139.6, 98159.4]              1133
(98159.4, 98179.2]              1040
(98179.2, 98199.0]               733
Name: zipcode, dtype: int64
Min 98001
Max 98199



lat
(47.154, 47.218]     181
(47.218, 47.28]      329
(47.28, 47.342]     1428
(47.342, 47.405]    1930
(47.405, 47.467]    1432
(47.467, 47.529]    2584
(47.529, 47.591]    3863
(47.591, 47.653]    2997
(47.653, 47.715]    4019
(47.715, 47.778]    2837
Name: lat, dtype: int64
Min 47.1559
Max 47.7776



long
(-122.521, -122.399]     419
(-122.399, -122.278]    8829
(-122.278, -122.158]    5634
(-122.158, -122.037]    3908
(-122.037, -121.917]    2091
(-121.917, -121.797]     483
(-121.797, -121.676]     217
(-121.676, -121.556]       2
(-121.556, -121.435]       2
(-121.435, -121.315]      15
Name: long, dtype: int64
Min -122.519
Max -121.315



sqft_living15
(393.18800000000005, 980.1]     312
(980.1, 1561.2]                6439
(1561.2, 2142.3]               7587
(2142.3, 2723.4]               4200
(2723.4, 3304.5]               2001
(3304.5, 3885.6]                728
(3885.6, 4466.7]                232
(4466.7, 5047.8]                 81
(5047.8, 5628.9]                 12
(5628.9, 6210.0]                  8
Name: sqft_living15, dtype: int64
Min 399
Max 6210



sqft_lot15
(-219.55, 87705.9]      21203
(87705.9, 174760.8]       196
(174760.8, 261815.7]      166
(261815.7, 348870.6]       21
(348870.6, 435925.5]       10
(435925.5, 522980.4]        1
(522980.4, 610035.3]        1
(610035.3, 697090.2]        0
(697090.2, 784145.1]        0
(784145.1, 871200.0]        2
Name: sqft_lot15, dtype: int64
Min 651
Max 871200



In [9]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21600 entries, 0 to 21612
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21600 non-null  float64
 1   bedrooms       21600 non-null  int64  
 2   bathrooms      21600 non-null  float64
 3   sqft_living    21600 non-null  int64  
 4   sqft_lot       21600 non-null  int64  
 5   floors         21600 non-null  float64
 6   waterfront     21600 non-null  int64  
 7   view           21600 non-null  int64  
 8   condition      21600 non-null  int64  
 9   grade          21600 non-null  int64  
 10  sqft_above     21600 non-null  int64  
 11  sqft_basement  21600 non-null  int64  
 12  yr_built       21600 non-null  int64  
 13  renovated      21600 non-null  int64  
 14  zipcode        21600 non-null  int64  
 15  lat            21600 non-null  float64
 16  long           21600 non-null  float64
 17  sqft_living15  21600 non-null  int64  
 18  sqft_lot15     21600 non-null  int64  
dtypes: float64(5), int64(14)
memory usage: 3.3 MB

Pearson correlation matrix

We use the Pearson correlation coefficient to examine the strength and direction of the linear relationship between two continuous variables.

The correlation coefficient can range in value from −1 to +1. The larger the absolute value of the coefficient, the stronger the relationship between the variables. For the Pearson correlation, an absolute value of 1 indicates a perfect linear relationship. A correlation close to 0 indicates no linear relationship between the variables.

The sign of the coefficient indicates the direction of the relationship. If both variables tend to increase or decrease together, the coefficient is positive, and the line that represents the correlation slopes upward. If one variable tends to increase as the other decreases, the coefficient is negative, and the line that represents the correlation slopes downward.

In [10]:
sns.set(style="whitegrid", font_scale=1)
plt.figure(figsize=(13,13))
plt.title('Pearson Correlation Matrix',fontsize=25)
sns.heatmap(data.corr(),linewidths=0.25,vmax=0.7,square=True,cmap="GnBu",
            linecolor='w',annot=True, annot_kws={"size":7}, 
            cbar_kws={"shrink": .7})
Out[10]:
<AxesSubplot:title={'center':'Pearson Correlation Matrix'}>

Which features are more correlated to the price?

In [11]:
price_corr = data.corr()['price'].sort_values(ascending=False)
print(price_corr)
price            1.000000
sqft_living      0.694332
grade            0.677461
sqft_above       0.598753
sqft_living15    0.597792
bathrooms        0.520003
view             0.397511
lat              0.320394
bedrooms         0.317871
sqft_basement    0.312569
floors           0.264089
waterfront       0.248897
renovated        0.123182
sqft_lot         0.091994
sqft_lot15       0.084420
yr_built         0.054103
condition        0.040820
long             0.024028
zipcode         -0.051128
Name: price, dtype: float64

Discovered various outlier samples that have incredibly large square foot basements, while more than half the dataset tends to not even have a basement. Feature should be cleaned

In [12]:
plot = sns.boxplot(data['sqft_basement'])
Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
In [13]:
# check distribution grouped by bins
data['sqft_basement'].value_counts(bins=20, sort=False)
Out[13]:
(-4.131, 206.5]     13691
(206.5, 413.0]       1315
(413.0, 619.5]       1677
(619.5, 826.0]       1750
(826.0, 1032.5]      1477
(1032.5, 1239.0]      779
(1239.0, 1445.5]      477
(1445.5, 1652.0]      220
(1652.0, 1858.5]      109
(1858.5, 2065.0]       50
(2065.0, 2271.5]       32
(2271.5, 2478.0]        6
(2478.0, 2684.5]       10
(2684.5, 2891.0]        4
(2891.0, 3097.5]        0
(3097.5, 3304.0]        1
(3304.0, 3510.5]        1
(3510.5, 3717.0]        0
(3717.0, 3923.5]        0
(3923.5, 4130.0]        1
Name: sqft_basement, dtype: int64
In [14]:
sns.histplot(data['sqft_basement'])
Out[14]:
<AxesSubplot:xlabel='sqft_basement', ylabel='Count'>
In [15]:
# print an overview of the data relation with every pair of features
sns.pairplot(data)
Out[15]:
<seaborn.axisgrid.PairGrid at 0x7fc7604c55b0>

as longitud tends to decrease, that is go west, in King county, there appears to be more houses, and particularly houses with higher price. Similarly with the latitud as it increases, that means, as the house is more to the north, there are more houses with high price in the King County area in washington state.

In [16]:
sns.scatterplot(data['long'], data['price'])
Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Out[16]:
<AxesSubplot:xlabel='long', ylabel='price'>
In [17]:
sns.scatterplot(data['lat'], data['price'])
Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Out[17]:
<AxesSubplot:xlabel='lat', ylabel='price'>
Modeling

Splitting the data¶

In [124]:
X = data.drop('price', axis=1)
y = data['price']
In [125]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,
                                                    random_state=0)
In [126]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(15120, 18)
(6480, 18)
(15120,)
(6480,)

Creating model¶

In [127]:
params = {'iterations':10000,
        'learning_rate':0.01,
        'depth':3,
        'loss_function':'RMSE',
        'eval_metric':'RMSE',
        'random_seed':55,
        #'cat_features':boston_categories,
        'metric_period':200,  
        'od_type':"Iter",  
        'od_wait':20,  
        'verbose':True,
        'use_best_model':True}
model = CatBoostRegressor(**params)

Training model¶

In [128]:
model.fit(X_train, y_train,
         eval_set=(X_test, y_test),
         use_best_model=True,
         plot=True,
         verbose=True)
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 345891.9480508	test: 345907.7598494	best: 345907.7598494 (0)	total: 65.9ms	remaining: 10m 59s
200:	learn: 189988.8717602	test: 190680.6683071	best: 190680.6683071 (200)	total: 454ms	remaining: 22.1s
400:	learn: 154776.2387828	test: 155286.2193118	best: 155286.2193118 (400)	total: 795ms	remaining: 19s
600:	learn: 140133.9061283	test: 141310.5918705	best: 141310.5918705 (600)	total: 1.16s	remaining: 18.1s
800:	learn: 132210.0940179	test: 134144.6337262	best: 134144.6337262 (800)	total: 1.5s	remaining: 17.3s
1000:	learn: 127181.4633115	test: 129958.1415207	best: 129958.1415207 (1000)	total: 1.85s	remaining: 16.6s
1200:	learn: 123359.3094657	test: 127261.7334717	best: 127261.7334717 (1200)	total: 2.19s	remaining: 16.1s
1400:	learn: 120007.6008026	test: 125203.4198809	best: 125203.4198809 (1400)	total: 2.51s	remaining: 15.4s
1600:	learn: 117386.9432710	test: 123751.2293790	best: 123751.2293790 (1600)	total: 2.83s	remaining: 14.9s
1800:	learn: 115380.6178454	test: 122672.3821423	best: 122667.8901137 (1799)	total: 3.21s	remaining: 14.6s
2000:	learn: 113579.9670289	test: 121642.4625666	best: 121642.4625666 (2000)	total: 3.56s	remaining: 14.2s
2200:	learn: 111892.9979306	test: 120710.9809830	best: 120710.9809830 (2200)	total: 3.9s	remaining: 13.8s
2400:	learn: 110331.6577205	test: 119856.9197229	best: 119851.6829093 (2398)	total: 4.24s	remaining: 13.4s
2600:	learn: 109038.3602758	test: 119128.3665563	best: 119128.3665563 (2600)	total: 4.58s	remaining: 13s
2800:	learn: 107639.0531884	test: 118289.9762116	best: 118289.9762116 (2800)	total: 4.95s	remaining: 12.7s
3000:	learn: 106401.8630716	test: 117658.9152103	best: 117658.9152103 (3000)	total: 5.3s	remaining: 12.4s
3200:	learn: 105287.0977579	test: 117151.8333360	best: 117151.8333360 (3200)	total: 5.64s	remaining: 12s
3400:	learn: 104185.4108282	test: 116649.4155660	best: 116649.4155660 (3400)	total: 5.99s	remaining: 11.6s
3600:	learn: 103183.0309551	test: 116192.1404032	best: 116192.1404032 (3600)	total: 6.33s	remaining: 11.2s
3800:	learn: 102230.0496211	test: 115776.5596362	best: 115776.5596362 (3800)	total: 6.67s	remaining: 10.9s
4000:	learn: 101337.7945761	test: 115441.5667786	best: 115441.5667786 (4000)	total: 7s	remaining: 10.5s
4200:	learn: 100510.7079492	test: 115090.2869713	best: 115089.8052618 (4199)	total: 7.35s	remaining: 10.1s
4400:	learn: 99721.1818495	test: 114771.7882240	best: 114768.3722578 (4396)	total: 7.69s	remaining: 9.78s
4600:	learn: 99013.3172802	test: 114476.7330777	best: 114476.5943900 (4598)	total: 8.03s	remaining: 9.42s
4800:	learn: 98315.0535317	test: 114225.0593788	best: 114222.4329617 (4792)	total: 8.37s	remaining: 9.07s
5000:	learn: 97633.1803013	test: 113977.8426713	best: 113977.4221000 (4999)	total: 8.72s	remaining: 8.71s
5200:	learn: 97000.2964890	test: 113755.0302323	best: 113754.3961384 (5199)	total: 9.05s	remaining: 8.35s
5400:	learn: 96405.5209028	test: 113565.1380152	best: 113565.1380152 (5400)	total: 9.4s	remaining: 8.01s
5600:	learn: 95851.6716772	test: 113355.8417090	best: 113355.8417090 (5600)	total: 9.75s	remaining: 7.65s
5800:	learn: 95294.3907150	test: 113116.3388004	best: 113115.2411821 (5798)	total: 10.1s	remaining: 7.3s
6000:	learn: 94734.5256658	test: 112817.6002707	best: 112817.6002707 (6000)	total: 10.4s	remaining: 6.95s
6200:	learn: 94238.1977740	test: 112568.6692028	best: 112568.6692028 (6200)	total: 10.8s	remaining: 6.59s
6400:	learn: 93709.2265475	test: 112312.6645411	best: 112312.6645411 (6400)	total: 11.1s	remaining: 6.24s
6600:	learn: 93252.8478167	test: 112132.3237084	best: 112131.0433792 (6593)	total: 11.4s	remaining: 5.89s
6800:	learn: 92783.6693648	test: 111906.9067528	best: 111906.0147754 (6797)	total: 11.8s	remaining: 5.54s
7000:	learn: 92322.1106012	test: 111781.4955484	best: 111775.1585714 (6980)	total: 12.1s	remaining: 5.19s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 111775.1586
bestIteration = 6980

Shrink model to first 6981 iterations.
Out[128]:
<catboost.core.CatBoostRegressor at 0x7fcc872d8fa0>
Model Evaluation

Metrics¶

$R^2$: compares models prediction to the mean of the targets. that is the expected value

In [129]:
# predictions on the test set
y_pred = model.predict(X_test)

print('MAE: ',mean_absolute_error(y_test,y_pred)) # sum(abs(y_test - y_pred) / len(y_test))
print('MSE: ',mean_squared_error(y_test,y_pred)) # sum((y_test - y_pred)**2 / len(y_test))
print('RMSE: ',np.sqrt(mean_squared_error(y_test,y_pred)))
print('R^2 score', model.score(X_test,y_test)) # Calculates r^2
MAE:  67378.75256223505
MSE:  12493686073.656778
RMSE:  111775.15857137837
R^2 score 0.8967156311766572
SHAP

Calculating Exact SHAP values implementation¶

$\phi_i \rightarrow$ Shapley value for feature $i$ (e.g.: {Age})
$f \rightarrow$ Black Box Model
$x \rightarrow$ Input data point
$z' \rightarrow$ Subset (e.g.: {Age, BMI})
$x' \rightarrow$ Simplified data input
Using a mapping function we transform $x \rightarrow x'$
$z'\subseteq x' \rightarrow$ Iterate over all possible combinations of features
$f_x(z') \rightarrow$ Black Box Model output for our input with the feature(s) we are interested in (e.g.:{Age, BMI})
$f_x(z' \backslash i) \rightarrow$ Black Box Model output for our input without the feature(s) we are interested in (e.g.:{BMI})
$[f_x(z') - f_x(z' \backslash i)] \rightarrow$ Tells us how feature $i$ contributed to that subset. Also called the marginal value
$M \rightarrow$ Total number of features
$\frac{|z'|!(M - |z'| - 1)!}{M!} \rightarrow$ Weighting according to how many players are in that correlation

$$\phi_i(f,x) = \sum_{z'\subseteq x'} \frac{|z'|!(M - |z'| - 1)!}{M!} [f_x(z') - f_x(z' \backslash i)]$$

In [18]:
from itertools import combinations
In [195]:
# amount of coalitions is (2^features)-1
def get_coalitions(sample, feature):
    rest_features = []
    # get the rest of the features
    for feat in sample.index:
        if feat != feature:
            rest_features.append(feat)
    # get all possible coalitions for every feature
    coalition_list = []
    for feat_num in range(len(x.index)):
        for coalition in combinations(rest_features,feat_num):
            coalition_list.append(list(coalition))
    
    return coalition_list
In [194]:
def get_val(model, background, sample, coalition):
    # get coalition feature values and map them to a dictionary
    coalition_features = {c: sample[c] for c in coalition} 
    # assign all coalition values, to whole dataset, predict, 
    # and get mean()
    return model.predict(background.assign(**coalition_features)).mean()
In [192]:
import scipy
from math import factorial
# val(S U i) - val(S)
def get_contributions(model, background, sample, feat, coalition):
    # val(S U i)
    val_s_i = get_val(model, background, sample, coalition + [feat])
    val_s = get_val(model, background, sample, coalition)
    ## get worth of coalition
    val = val_s_i - val_s
    ## get number of coalitions
    z = len(coalition) # number of present features
    M = len(sample.index) # number of total features
    num_coalitions = (factorial(z)*factorial(M - z - 1)) / factorial(M)
    ## result
    return num_coalitions * val    

# def coalition_contribution(model, X_train, x, col, coalition):
#     marginal_gain = get_val(model, X_train, x, coalition + [col]) - get_val(model, X_train, x, coalition)
#     num_coalitions = 1 / (scipy.special.comb(len(x) - 1, len(coalition)) * len(x))
#     return num_coalitions * marginal_gain  

# print('good', coalition_contribution(model, X_train, X.iloc[0:1], ['floors'], ['bathrooms']))
In [196]:
def calculateSHAP(model, background, X):
    shap_values = []
    #for every sample
    for i in range(np.shape(X)[0]):
        sample = X.iloc[i]
        # 1. calculate coalitions\
        # for every feature
        shap_vals_for_x = []
        for feat in X.columns:
            coalitions = get_coalitions(sample, feat)
            print(1)
            contributions = []
            for coalition in coalitions:
                #print(4)
                contributions.append(get_contributions(model, 
                                                    background,
                                                     sample, 
                                                     feat,
                                                     coalition))
            print(2)
            shap_val = np.sum(contributions)
            shap_vals_for_x.append(shap_val)
        shap_values.append(shap_vals_for_x)
        print('feature done ', i)
    phi0 = np.average(model.predict(background))
    return phi0, shap_values
        
In [197]:
explainer = shap.Explainer(model.predict, X_train)
shap_values = explainer(X.iloc[0:1])
In [188]:
X_train = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']] 
X_test = X_test[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']] 
model.fit(X_train, y_train,
         eval_set=(X_test, y_test),
         use_best_model=True,
         plot=True,
         verbose=True)
Custom logger is already specified. Specify more than one logger at same time is not thread safe.
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 346240.5472848	test: 346245.6459987	best: 346245.6459987 (0)	total: 4.06ms	remaining: 40.6s
200:	learn: 244950.9176752	test: 246191.8338167	best: 246191.8338167 (200)	total: 314ms	remaining: 15.3s
400:	learn: 236410.3781219	test: 238969.7724612	best: 238969.7724612 (400)	total: 619ms	remaining: 14.8s
600:	learn: 233077.2662404	test: 236673.2360873	best: 236673.2360873 (600)	total: 932ms	remaining: 14.6s
800:	learn: 230981.6648494	test: 235469.8422829	best: 235469.8422829 (800)	total: 1.26s	remaining: 14.4s
1000:	learn: 229458.6218974	test: 234757.4685653	best: 234757.4685653 (1000)	total: 1.56s	remaining: 14.1s
1200:	learn: 228309.3816388	test: 234302.1611602	best: 234295.8773599 (1193)	total: 1.87s	remaining: 13.7s
1400:	learn: 227275.1605411	test: 233762.1118315	best: 233762.1118315 (1400)	total: 2.17s	remaining: 13.3s
1600:	learn: 226459.5291490	test: 233429.8827411	best: 233429.8827411 (1600)	total: 2.5s	remaining: 13.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 233362.5671
bestIteration = 1654

Shrink model to first 1655 iterations.
Out[188]:
<catboost.core.CatBoostRegressor at 0x7fcc872d8fa0>
In [189]:
def coalition_worth(model, X_train, x, coalition):
    coalition_features = {c: x[c] for c in coalition} 
    return model.predict(X_train.assign(**coalition_features)).mean()

def coalitions(x, col):
    remaining_features = [feature for feature in x.index if feature != col]
    for feature in range(len(remaining_features) + 1):
        for coalition in combinations(remaining_features, feature):
            yield list(coalition)
            
def coalition_contribution(model, X_train, x, col, coalition):
    marginal_gain = coalition_worth(model, X_train, x, coalition + [col]) - coalition_worth(model, X_train, x, coalition)
    num_coalitions = 1 / (scipy.special.comb(len(x) - 1, len(coalition)) * len(x))
    return num_coalitions * marginal_gain  
            
def calculate_exact_shap_values(model, X_train, X_sample):
    if isinstance(X_sample, pd.Series):
        X_sample = pd.DataFrame(X_sample).T
    shap_values_list = []
    for i in range(X_sample.shape[0]):
        x = X_sample.iloc[i]
        shap_values = []
        for col in X_train.columns:
            shap_value =  np.sum([coalition_contribution(model, X_train, x, col, coalition) for coalition in coalitions(x, col)])
            shap_values.append(shap_value)
        shap_values_list.append(shap_values)
        print('feature finished', i)
    phi0 = np.average(model.predict(X_train))
    return phi0, shap_values_list

calculate_exact_shap_values(model, X_train.iloc[0:10], X.iloc[0])
feature finished 0
Out[189]:
(476313.84119890817,
 [[4475.584453349034,
   9320.233133025104,
   -162224.169014538,
   35598.95174850948]])
In [199]:
calculateSHAP(model, X_train.iloc[0:10], X.iloc[0:1])
1
2
1
2
1
2
1
2
feature done  0
Out[199]:
(476313.84119890817,
 [[4475.584453349034,
   9320.233133025104,
   -162224.169014538,
   35598.95174850948]])
In [ ]:
X_train = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']] 
In [184]:
X_train = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']] 
X = X[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']] 
In [186]:
X_train
Out[186]:
bedrooms bathrooms sqft_living sqft_lot
6351 3 2.50 1970 6291
8238 3 1.50 2230 6000
19251 5 2.50 2990 7292
4307 3 1.00 1130 7200
6360 3 2.50 3060 7831
... ... ... ... ...
13135 4 2.50 2450 5079
19661 3 2.50 1650 1793
9856 3 1.75 1460 12367
10810 3 1.00 1030 8395
2736 3 1.75 1310 18400

15120 rows × 4 columns

In [138]:
shap_values
Out[138]:
.values =
array([[  1176.72304471, -11652.07931896, -80612.04994778,
         -7140.81036387,   6051.33455908,  -1961.08864735,
         -8085.30680057,  -8853.88935721, -51810.33454926,
         -3627.74886706,   5709.54991419,   4656.60319149,
         -1102.35000946, -26616.71562021, -77728.02089429,
          2596.11671592, -32187.66915759,  -1688.80031901]])

.base_values =
array([513418.88552921])

.data =
array([[ 3.00000e+00,  1.00000e+00,  1.18000e+03,  5.65000e+03,
         1.00000e+00,  0.00000e+00,  0.00000e+00,  3.00000e+00,
         7.00000e+00,  1.18000e+03,  0.00000e+00,  1.95500e+03,
         0.00000e+00,  9.81780e+04,  4.75112e+01, -1.22257e+02,
         1.34000e+03,  5.65000e+03]])
In [149]:
model.predict(X.iloc[0:1])
Out[149]:
array([220542.34910198])
In [150]:
y[0]
Out[150]:
221900.0
In [13]:
np.shape(X)
Out[13]:
(21613, 18)
In [24]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X) # returns an array of shapley values
shap_values2 = explainer(X) # this returns a dictionary containing
# shapley values, data values, base values. Convenient for plotting
In [25]:
print('shap shape', np.shape(shap_values))
shap shape (21600, 18)
SHAP Local
In [26]:
# print data sample to be explained
sample = 10
print('data sample', sample)
data.iloc[sample,:]
data sample 10
Out[26]:
price            662500.0000
bedrooms              3.0000
bathrooms             2.5000
sqft_living        3560.0000
sqft_lot           9796.0000
floors                1.0000
waterfront            0.0000
view                  0.0000
condition             3.0000
grade                 8.0000
sqft_above         1860.0000
sqft_basement      1700.0000
yr_built           1965.0000
renovated             0.0000
zipcode           98007.0000
lat                  47.6007
long               -122.1450
sqft_living15      2210.0000
sqft_lot15         8925.0000
Name: 10, dtype: float64
In [27]:
# show shap values for the predicted target (1)
shap_values2[sample]
Out[27]:
.values =
array([  6445.6665504 ,    954.19074073, 177898.46191639,   9353.44427428,
         2995.15805205,  -6073.05016675, -10906.61842913, -11017.73548305,
       -17174.70422995,   5308.78035444, -90282.66135448, -29318.29415725,
        -2681.83657411,   6035.10626459, 139414.82771066, -31511.97459231,
        10523.58385478,  -5447.69598461])

.base_values =
536623.1089338156

.data =
array([ 3.00000e+00,  2.50000e+00,  3.56000e+03,  9.79600e+03,
        1.00000e+00,  0.00000e+00,  0.00000e+00,  3.00000e+00,
        8.00000e+00,  1.86000e+03,  1.70000e+03,  1.96500e+03,
        0.00000e+00,  9.80070e+04,  4.76007e+01, -1.22145e+02,
        2.21000e+03,  8.92500e+03])
In [28]:
print('Actual value = ', y[sample])
sample_pred = model.predict(X.iloc[sample:sample+1,:])
print('Predicted value = ', sample_pred)
Actual value =  662500.0
Predicted value =  [691137.75768051]
In [29]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[sample,:], 
                X_test.iloc[sample,:])
Out[29]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [30]:
shap.plots.waterfall(shap_values2[sample], max_display=9)
SHAP Global
In [31]:
shap.summary_plot(shap_values, X, max_display=22)
In [32]:
shap.summary_plot(shap_values2, X,plot_type="bar", max_display=22)